Chapter 6 Examples and Comments

In the following, I include my feedback while preserving anonymity.

6.1 Information Obtained by extra = TRUE

The first way is to use the option extra = TRUE when you import the data using WDI.

6.1.1 Population of Zimbabwe and Similar Countries

# Download total population (indicator SP.POP.TOTL) for all economies,
# 1960-2020. extra = TRUE adds the classification columns (region, income,
# lending, ...) that the filters below rely on.
df_population <- WDI(
  country = "all",
  indicator = c(population = "SP.POP.TOTL"),
  start = 1960,
  end = 2020,
  extra = TRUE
) %>%
  as_tibble()
DT::datatable(df_population)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Find out extra information on Zimbabwe, i.e., which group Zimbabwe is in for region, income and lending.

# Look up Zimbabwe's classification; distinct() collapses the yearly rows
# into the unique combination of the three columns.
df_population %>%
  filter(country == "Zimbabwe") %>%
  distinct(region, income, lending)
## # A tibble: 1 × 3
##   region             income              lending
##   <chr>              <chr>               <chr>  
## 1 Sub-Saharan Africa Lower middle income Blend

Find out all countries in the same groups as Zimbabwe.

# Countries sharing all three classifications with Zimbabwe.
df_population %>%
  filter(
    region == "Sub-Saharan Africa",
    income == "Lower middle income",
    lending == "Blend"
  ) %>%
  distinct(country)
## # A tibble: 6 × 1
##   country    
##   <chr>      
## 1 Congo, Rep.
## 2 Cameroon   
## 3 Cabo Verde 
## 4 Kenya      
## 5 Nigeria    
## 6 Zimbabwe

Since there are only 6 countries belonging to the same group as Zimbabwe, let us widen our search.

# Drop the lending condition to widen the peer group.
df_population %>%
  filter(
    region == "Sub-Saharan Africa",
    income == "Lower middle income"
  ) %>%
  distinct(country)
## # A tibble: 17 × 1
##    country              
##    <chr>                
##  1 Angola               
##  2 Benin                
##  3 Congo, Rep.          
##  4 Cote d'Ivoire        
##  5 Cameroon             
##  6 Cabo Verde           
##  7 Ghana                
##  8 Kenya                
##  9 Comoros              
## 10 Lesotho              
## 11 Mauritania           
## 12 Nigeria              
## 13 Senegal              
## 14 Sao Tome and Principe
## 15 Eswatini             
## 16 Tanzania             
## 17 Zimbabwe

The following is the first chart.

# Population over time (log10 scale) for the countries that match Zimbabwe
# in region, income and lending.
df_population %>%
  filter(
    region == "Sub-Saharan Africa",
    income == "Lower middle income",
    lending == "Blend"
  ) %>%
  ggplot(aes(x = year, y = population, color = country)) +
  geom_line() +
  scale_y_log10()

The second chart.

# Same chart for the wider group (region and income only).
df_population %>%
  filter(
    region == "Sub-Saharan Africa",
    income == "Lower middle income"
  ) %>%
  ggplot(aes(x = year, y = population, color = country)) +
  geom_line() +
  scale_y_log10()

Fix the income level and compare the regions.

# Average population per region and year among lower middle income economies.
df_population %>%
  filter(income == "Lower middle income") %>%
  group_by(region, year) %>%
  summarize(average_in_region = mean(population, na.rm = TRUE)) %>%
  ggplot(aes(x = year, y = average_in_region, color = region)) +
  geom_line()
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

A chart in log 10 scale.

# Same regional averages, drawn on a log10 y axis.
df_population %>%
  filter(income == "Lower middle income") %>%
  group_by(region, year) %>%
  summarize(average_in_region = mean(population, na.rm = TRUE)) %>%
  ggplot(aes(x = year, y = average_in_region, color = region)) +
  geom_line() +
  scale_y_log10()
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

6.1.2 Fertility Rate of Japan and Similar Countries

# Download the total fertility rate (indicator SP.DYN.TFRT.IN) for all
# economies, 1960-2020, together with the extra classification columns.
df_fertility <- WDI(
  country = "all",
  indicator = c(fertility = "SP.DYN.TFRT.IN"),
  start = 1960,
  end = 2020,
  extra = TRUE
) %>%
  as_tibble()
DT::datatable(df_fertility)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
# Look up Japan's region, income and lending classification.
df_fertility %>%
  filter(country == "Japan") %>%
  distinct(region, income, lending)
## # A tibble: 1 × 3
##   region              income      lending       
##   <chr>               <chr>       <chr>         
## 1 East Asia & Pacific High income Not classified

Let us compare Japan’s fertility rate with that of similar countries, i.e., high income countries in East Asia & Pacific.

# High income economies in East Asia & Pacific.
df_fertility %>%
  filter(
    region == "East Asia & Pacific",
    income == "High income"
  ) %>%
  distinct(country)
## # A tibble: 13 × 1
##    country                 
##    <chr>                   
##  1 Australia               
##  2 Brunei Darussalam       
##  3 Guam                    
##  4 Hong Kong SAR, China    
##  5 Japan                   
##  6 Korea, Rep.             
##  7 Macao SAR, China        
##  8 Northern Mariana Islands
##  9 New Caledonia           
## 10 Nauru                   
## 11 New Zealand             
## 12 French Polynesia        
## 13 Singapore
# Fertility rate over time, one line per country in the peer group.
df_fertility %>%
  filter(
    region == "East Asia & Pacific",
    income == "High income"
  ) %>%
  ggplot(aes(x = year, y = fertility, color = country)) +
  geom_line() +
  labs(title = "Fertility Rate of High Income Countries in East Asia & Pacific")
## Warning: Removed 122 row(s) containing missing values (geom_path).

Let us check the number of countries in each region.

# Number of high income economies per region. n_distinct() counts each
# country once, so the yearly rows need no prior de-duplication.
df_fertility %>%
  filter(income == "High income") %>%
  group_by(region) %>%
  summarize(number_of_countries = n_distinct(country))
## # A tibble: 6 × 2
##   region                     number_of_countries
##   <chr>                                    <int>
## 1 East Asia & Pacific                         13
## 2 Europe & Central Asia                       37
## 3 Latin America & Caribbean                   17
## 4 Middle East & North Africa                   8
## 5 North America                                3
## 6 Sub-Saharan Africa                           1
# Average fertility rate per region and year among high income economies.
df_fertility %>%
  filter(income == "High income") %>%
  group_by(region, year) %>%
  summarize(average_in_region = mean(fertility, na.rm = TRUE)) %>%
  ggplot(aes(x = year, y = average_in_region, color = region)) +
  geom_line()
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
## Warning: Removed 16 row(s) containing missing values (geom_path).

6.2 Other Groups

So far we used only three classifications of groups, region, income and lending. However, CLASS.xls contains more information.

For example, to compare data within a group, say the Arab World (ARB), we need the membership information taken from CLASS.xls. If you need the aggregated data of the group itself, it is already included in the data. The Arab World group consists of 22 countries.

# Members of the Arab World group (gcode "ARB"): a two-column data frame
# and, separately, the bare vector of iso3c codes.
wb_arb <- wb_groups %>%
  filter(gcode == "ARB") %>%
  select(iso3c, country)
wb_arb_vec <- wb_arb$iso3c
wb_arb
## # A tibble: 22 × 2
##    iso3c country             
##    <chr> <chr>               
##  1 ARE   United Arab Emirates
##  2 BHR   Bahrain             
##  3 COM   Comoros             
##  4 DJI   Djibouti            
##  5 DZA   Algeria             
##  6 EGY   Egypt, Arab Rep.    
##  7 IRQ   Iraq                
##  8 JOR   Jordan              
##  9 KWT   Kuwait              
## 10 LBN   Lebanon             
## # … with 12 more rows
wb_arb_vec
##  [1] "ARE" "BHR" "COM" "DJI" "DZA" "EGY" "IRQ" "JOR" "KWT" "LBN" "LBY" "MAR"
## [13] "MRT" "OMN" "PSE" "QAT" "SAU" "SDN" "SOM" "SYR" "TUN" "YEM"

I created the data frame containing Arab World countries and their iso3c codes together with a vector containing iso3c only.

6.2.1 iso3c vector: wb_arb_vec

# Keep only Arab World members by matching on the iso3c code vector.
df_fertility %>%
  filter(iso3c %in% wb_arb_vec) %>%
  ggplot() +
  geom_line(aes(x = year, y = fertility, color = country))
## Warning: Removed 30 row(s) containing missing values (geom_path).

6.2.2 Data Frame: wb_arb

For example, if the original data does not have an iso3c code, you can extract the relevant part of the data by joining on a column that the two data frames share. In the following, I used by = "country" in the second code chunk.

# No `by` is given, so the join uses every column the two tables share
# (country and iso3c, as reported in the message).
df_fertility %>%
  right_join(wb_arb) %>%
  ggplot() +
  geom_line(aes(x = year, y = fertility, color = country)) +
  labs(title = "right_join without using by")
## Joining, by = c("country", "iso3c")
## Warning: Removed 30 row(s) containing missing values (geom_path).

# Join on the country name only.
df_fertility %>%
  right_join(wb_arb, by = "country") %>%
  ggplot() +
  geom_line(aes(x = year, y = fertility, color = country)) +
  labs(title = "`right_join` with `by`")
## Warning: Removed 30 row(s) containing missing values (geom_path).

6.3 Wrangling: Transform and Tidy Data

# Download the World Bank Excel file for indicator EN.ATM.CO2E.PC into data/.
url_class <- "https://api.worldbank.org/v2/en/indicator/EN.ATM.CO2E.PC?downloadformat=excel"
# download.file() does not create missing directories, so make sure the
# destination folder exists before downloading.
if (!dir.exists("data")) {
  dir.create("data")
}
# mode = "wb" writes the file in binary mode, which an .xls file requires.
download.file(url = url_class, mode = "wb", destfile = "data/API_EN.ATM.CO2E.PC_DS2_en_excel_v2_3469464.xls")

6.3.1 Tidying data

# Read the first sheet of the downloaded workbook. skip = 3 skips the first
# three rows of the sheet and n_max caps the number of rows read.
# NOTE(review): slice(-1) drops the first data row - confirm that this row
# is not an economy that should be kept.
country_tmp <- read_excel("data/API_EN.ATM.CO2E.PC_DS2_en_excel_v2_3469464.xls", sheet = 1, skip = 3, n_max =271) %>% slice(-1)
DT::datatable(country_tmp)

The following is an original code chunk.

# Keep the four identifier columns and the year columns 1960-2018.
Co2_country <- country_tmp %>%
  select(`Country Name`, `Country Code`, `Indicator Name`, `Indicator Code`,`1960`:`2018`)
# Columns 5:63 are the 59 year columns; their names become the values of
# `year`, which is therefore a character column at this point.
Co2_country <- pivot_longer(Co2_country,cols = 5:63, names_to = "year", values_to = "Co2", values_drop_na = TRUE) %>%
  select(ID = `Country Code`, Country = `Country Name`, year, Co2)
DT::datatable(Co2_country)
# With a character x variable geom_line() cannot connect the points
# (see the message below), so the chart comes out empty.
Co2_country %>% filter(ID == "WLD") %>% 
  ggplot(aes(x = year, y = Co2)) +
  geom_line()
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

The chart is empty. The following is much better, but we cannot read the years, and a line graph may be preferable for seeing changes. Please notice that year is a character variable, not an integer.

# Points do not need to be connected, so they are drawn even though `year`
# is still a character column.
Co2_country %>% filter(ID == "WLD") %>% 
  ggplot(aes(x = year, y = Co2)) +
  geom_point()

In the following, I used a Base R command. Co2_country_rev$year denotes the column year of the data frame Co2_country_rev.

# Base R version: overwrite the year column of a copy with its integer form,
# then draw the world series as a line.
Co2_country_rev <- Co2_country
Co2_country_rev$year <- as.integer(Co2_country_rev$year)
Co2_country_rev %>%
  filter(ID == "WLD") %>%
  ggplot() +
  geom_line(aes(x = year, y = Co2))

Using dplyr, the following is an alternative.

# dplyr version: convert year to integer with mutate().
Co2_country_rev2 <- Co2_country %>%
  mutate(year = as.integer(year))
Co2_country_rev2 %>%
  filter(ID == "WLD") %>%
  ggplot() +
  geom_line(aes(x = year, y = Co2))

It is possible to set year to be an integer variable when pivot_longer is applied by adding names_transform = list(year = as.integer).

# Version 1: select the identifier and year columns, then convert year to
# integer while pivoting, via names_transform.
co2_country <- country_tmp %>%
  select(
    `Country Name`, `Country Code`, `Indicator Name`, `Indicator Code`,
    `1960`:`2018`
  ) %>%
  pivot_longer(
    cols = 5:63,
    names_to = "year",
    names_transform = list(year = as.integer),
    values_to = "Co2",
    values_drop_na = TRUE
  ) %>%
  select(ID = `Country Code`, Country = `Country Name`, year, Co2)
DT::datatable(co2_country)

# Version 2 (overwrites co2_country): pivot columns 5:63 of country_tmp
# directly and use lower-case column names.
co2_country <- country_tmp %>%
  pivot_longer(
    cols = 5:63,
    names_to = "year",
    names_transform = list(year = as.integer),
    values_to = "co2",
    values_drop_na = TRUE
  ) %>%
  select(id = `Country Code`, country = `Country Name`, year, co2)
DT::datatable(co2_country)

# Compare the world aggregate with three individual economies.
co2_country %>%
  filter(id %in% c("WLD", "USA", "CHN", "JPN")) %>%
  ggplot() +
  geom_line(aes(x = year, y = co2, color = id))

# Country names must match the World Bank `Country Name` values exactly:
# filter() silently drops any name that matches nothing. The World Bank
# lists the UK as "United Kingdom" and Russia as "Russian Federation".
# NOTE(review): newer World Bank files spell Vietnam as "Viet Nam" - confirm
# against the downloaded file.
COUNTRIES <- c(
  "China", "Japan", "United States", "United Kingdom", "India",
  "South Africa", "Malaysia", "Russian Federation", "Australia", "Canada",
  "Vietnam"
)
co2_country %>%
  filter(country %in% COUNTRIES) %>%
  ggplot(aes(x = year, y = co2, color = country)) +
  geom_line() +
  geom_point()

Compare the data above with those in WDI.

WDIsearch(string = "co2", field = "name") %>% DT::datatable()

6.4 Population Analysis

The original analysis uses Base R commands a lot; I do a similar analysis using tidyverse as an example.

The following population data has seven variables.

# Read the UN population indicators CSV straight from the URL.
# skip = 1 skips the first line of the file; the second column has no name
# and is renamed `...2` by read_csv (see the message below).
url <- "https://data.un.org/_Docs/SYB/CSV/SYB64_246_202110_Population%20Growth,%20Fertility%20and%20Mortality%20Indicators.csv"
df_un_pop <- read_csv(url, skip = 1)
## New names:
## • `` -> `...2`
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 4899 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): ...2, Series, Footnotes, Source
## dbl (3): Region/Country/Area, Year, Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_pop
## # A tibble: 4,899 × 7
##    `Region/Country/Area` ...2                   Year Series Value Footn…¹ Source
##                    <dbl> <chr>                 <dbl> <chr>  <dbl> <chr>   <chr> 
##  1                     1 Total, all countries…  2010 Popul…   1.2 Data r… "Unit…
##  2                     1 Total, all countries…  2010 Total…   2.6 Data r… "Unit…
##  3                     1 Total, all countries…  2010 Infan…  41   Data r… "Unit…
##  4                     1 Total, all countries…  2010 Mater… 248   <NA>    "Worl…
##  5                     1 Total, all countries…  2010 Life …  68.9 Data r… "Unit…
##  6                     1 Total, all countries…  2010 Life …  66.7 Data r… "Unit…
##  7                     1 Total, all countries…  2010 Life …  71.3 Data r… "Unit…
##  8                     1 Total, all countries…  2015 Popul…   1.2 Data r… "Unit…
##  9                     1 Total, all countries…  2015 Total…   2.5 Data r… "Unit…
## 10                     1 Total, all countries…  2015 Infan…  33.9 Data r… "Unit…
## # … with 4,889 more rows, and abbreviated variable name ¹​Footnotes

Check the regions of interest.

# Show the names behind the six numeric region codes.
df_un_pop %>%
  filter(`Region/Country/Area` %in% c(2, 9, 21, 419, 142, 150)) %>%
  distinct(`...2`)
## # A tibble: 6 × 1
##   ...2                         
##   <chr>                        
## 1 Africa                       
## 2 Northern America             
## 3 Latin America & the Caribbean
## 4 Asia                         
## 5 Europe                       
## 6 Oceania
# Keep the six regions only, then rename the needed columns to short
# lower-case names.
df_un_pop_rev <- df_un_pop %>%
  filter(`Region/Country/Area` %in% c(2, 9, 21, 419, 142, 150)) %>%
  select(
    rn = `Region/Country/Area`,
    region = `...2`,
    year = Year,
    series = Series,
    value = Value
  )
DT::datatable(df_un_pop_rev)
# List the indicators (series) available for these regions.
df_un_pop_rev %>%
  distinct(series)
## # A tibble: 7 × 1
##   series                                                  
##   <chr>                                                   
## 1 Population annual rate of increase (percent)            
## 2 Total fertility rate (children per women)               
## 3 Infant mortality for both sexes (per 1,000 live births) 
## 4 Life expectancy at birth for both sexes (years)         
## 5 Life expectancy at birth for males (years)              
## 6 Life expectancy at birth for females (years)            
## 7 Maternal mortality ratio (deaths per 100,000 population)
# One line chart per indicator, coloured by region.
df_un_pop_rev %>%
  filter(series == "Population annual rate of increase (percent)") %>%
  ggplot() +
  geom_line(aes(x = year, y = value, color = region)) +
  labs(title = "Population annual rate of increase (percent)")

df_un_pop_rev %>%
  filter(series == "Total fertility rate (children per women)") %>%
  ggplot() +
  geom_line(aes(x = year, y = value, color = region)) +
  labs(title = "Total fertility rate (children per women)")

df_un_pop_rev %>%
  filter(series == "Infant mortality for both sexes (per 1,000 live births)") %>%
  ggplot() +
  geom_line(aes(x = year, y = value, color = region)) +
  labs(title = "Infant mortality for both sexes (per 1,000 live births)")

6.5 Literacy rate, youth total (% of people ages 15-24)

I introduce this example because it uses semi_join to select the countries’ data, removing the aggregated data.

WDIsearch(string = "SE.ADT.1524.LT.ZS", field = "indicator", cache = NULL)
##                                             indicator 
##                                   "SE.ADT.1524.LT.ZS" 
##                                                  name 
## "Literacy rate, youth total (% of people ages 15-24)"
# Download the youth literacy rate for all economies. No start/end or extra
# arguments are given, and the result is not converted with as_tibble().
literacy_rate_youth <- WDI(country = "all", indicator = "SE.ADT.1524.LT.ZS")
DT::datatable(literacy_rate_youth)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

6.5.1 Filtering joins

Description

Filtering joins filter rows from x based on the presence or absence of matches in y:

  • semi_join() return all rows from x with a match in y.

  • anti_join() return all rows from x without a match in y.

  • Combine the literacy rate data with the country data

    • semi_join: the result contains only the countries listed in wb_countries
    • deleted some of the aggregated data
# Keep only the rows whose country name appears in wb_countries.
literacy_rate_youth_country <- semi_join(
  literacy_rate_youth,
  wb_countries,
  by = "country"
)
DT::datatable(literacy_rate_youth_country)

Compare with the following.

# Keep only the rows whose country name does NOT appear in wb_countries.
literacy_rate_youth_aggregated <- anti_join(
  literacy_rate_youth,
  wb_countries,
  by = "country"
)
DT::datatable(literacy_rate_youth_aggregated)

6.6 Visualization of the data

6.6.1 Gender: Ratio of girls to boys in primary, secondary and tertiary levels

  • Went to http://data.un.org/ (UNdata website)
  • Scrolled down to “Gender” within “Popular statistical tables, country (area) and regional profiles”
  • Copied CSV download link for “Ratio of girls to boys in primary, secondary and tertiary levels”
  • Pasted the link below⇩
# Store the CSV link once and read from that variable, so the URL is not
# duplicated. skip = 1 skips the first line of the file.
url_of_data <- "http://data.un.org/_Docs/SYB/CSV/SYB64_319_202110_Ratio%20of%20girls%20to%20boys%20in%20education.csv"
gender_education <- read_csv(url_of_data, skip = 1)
## New names:
## Rows: 2881 Columns: 7
## ── Column specification
## ────────────────────────────────────────────────────────
## Delimiter: "," chr (4): ...2, Series, Footnotes, Source dbl (3):
## Region/Country/Area, Year, Value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
gender_education
## # A tibble: 2,881 × 7
##    `Region/Country/Area` ...2                   Year Series Value Footn…¹ Source
##                    <dbl> <chr>                 <dbl> <chr>  <dbl> <chr>   <chr> 
##  1                     1 Total, all countries…  1995 Ratio…  0.91 <NA>    Unite…
##  2                     1 Total, all countries…  2005 Ratio…  0.95 <NA>    Unite…
##  3                     1 Total, all countries…  2010 Ratio…  0.97 <NA>    Unite…
##  4                     1 Total, all countries…  2015 Ratio…  1    <NA>    Unite…
##  5                     1 Total, all countries…  2017 Ratio…  1    <NA>    Unite…
##  6                     1 Total, all countries…  2018 Ratio…  0.98 <NA>    Unite…
##  7                     1 Total, all countries…  2019 Ratio…  0.98 Estima… Unite…
##  8                     1 Total, all countries…  1995 Ratio…  0.88 <NA>    Unite…
##  9                     1 Total, all countries…  2005 Ratio…  0.95 <NA>    Unite…
## 10                     1 Total, all countries…  2010 Ratio…  0.97 <NA>    Unite…
## # … with 2,871 more rows, and abbreviated variable name ¹​Footnotes
colnames(gender_education)
## [1] "Region/Country/Area" "...2"                "Year"               
## [4] "Series"              "Value"               "Footnotes"          
## [7] "Source"

Using pivot_wider is a good idea, and it makes the data easy to read. However, for visualization with tidyverse it is not necessary if you use filter and/or grouping.

# Rename the needed columns, then spread the series into one column each.
gender_education_tbl <- gender_education %>%
  select(
    num = `Region/Country/Area`,
    region = `...2`,
    year = Year,
    series = Series,
    value = Value
  ) %>%
  pivot_wider(names_from = series, values_from = value)
gender_education_tbl
## # A tibble: 1,173 × 6
##      num region                         year Ratio of girls to…¹ Ratio…² Ratio…³
##    <dbl> <chr>                         <dbl>               <dbl>   <dbl>   <dbl>
##  1     1 Total, all countries or areas  1995                0.91    0.88    0.95
##  2     1 Total, all countries or areas  2005                0.95    0.95    1.05
##  3     1 Total, all countries or areas  2010                0.97    0.97    1.07
##  4     1 Total, all countries or areas  2015                1       0.99    1.1 
##  5     1 Total, all countries or areas  2017                1       0.99    1.12
##  6     1 Total, all countries or areas  2018                0.98    0.99    1.12
##  7     1 Total, all countries or areas  2019                0.98    0.99    1.13
##  8    15 Northern Africa                1995                0.86    0.86    0.76
##  9    15 Northern Africa                2005                0.93    0.99    0.96
## 10    15 Northern Africa                2010                0.95    0.98    1.07
## # … with 1,163 more rows, and abbreviated variable names
## #   ¹​`Ratio of girls to boys in primary education`,
## #   ²​`Ratio of girls to boys in secondary education`,
## #   ³​`Ratio of girls to boys in tertiary education`

6.6.1.1 Visualization

The student asks: I do not know how to visualize this data using ggplot…

  1. First select one region and make a scatter plot and see the data.
# Scatter plot of all values for region code 1.
gender_education %>%
  filter(`Region/Country/Area` == 1) %>%
  ggplot(aes(x = Year, y = Value)) +
  geom_point()

2. Since there are several values in each year, use color and geom_line as well.

# Colour by series so that each education level gets its own line.
gender_education %>%
  filter(`Region/Country/Area` == 1) %>%
  ggplot() +
  geom_point(aes(x = Year, y = Value, color = Series)) +
  geom_line(aes(x = Year, y = Value, color = Series))

3. Now we choose several regions and start analyzing what you want. Probably it is easier to see in separate charts. I wanted to introduce facet_grid.

# Select twelve region codes and draw one panel per series.
gender_education %>%
  filter(
    `Region/Country/Area` %in%
      c(1, 15, 202, 21, 419, 143, 30, 35, 34, 145, 150, 9)
  ) %>%
  select(
    `Region/Country/Area`,
    region = `...2`,
    year = Year,
    value = Value,
    series = Series
  ) %>%
  mutate(year = as.integer(year)) %>%
  ggplot(aes(x = year, y = value, color = region)) +
  geom_point() +
  geom_line() +
  facet_grid(cols = vars(series))

6.6.2 Proportion of seats held by women in national parliament.

# Download the UN table on seats held by women in national parliaments and
# read the local copy.
url_of_df <- "http://data.un.org/_Docs/SYB/CSV/SYB64_317_202110_Seats%20held%20by%20women%20in%20Parliament.csv"
# download.file() does not create missing directories, so make sure the
# destination folder exists before downloading.
if (!dir.exists("data")) {
  dir.create("data")
}
download.file(url = url_of_df, destfile = "data/UN_WO.csv")
# skip = 1 skips the first line of the file.
df_UN_WO <- read_csv("data/UN_WO.csv", skip = 1)
## New names:
## Rows: 1958 Columns: 9
## ── Column specification
## ────────────────────────────────────────────────────────
## Delimiter: "," chr (5): ...2, Series, Last Election Date, Footnotes, Source dbl
## (3): Region/Country/Area, Year, Value lgl (1): Last Election Date footnote
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
df_UN_WO
## # A tibble: 1,958 × 9
##    `Region/Country/Area` ...2   Year Series Last …¹ Last …² Value Footn…³ Source
##                    <dbl> <chr> <dbl> <chr>  <chr>   <lgl>   <dbl> <chr>   <chr> 
##  1                     1 Tota…  2000 Seats… <NA>    NA       13.3 <NA>    "Inte…
##  2                     1 Tota…  2005 Seats… <NA>    NA       15.9 <NA>    "Inte…
##  3                     1 Tota…  2010 Seats… <NA>    NA       19   <NA>    "Inte…
##  4                     1 Tota…  2015 Seats… <NA>    NA       22.3 <NA>    "Inte…
##  5                     1 Tota…  2017 Seats… <NA>    NA       23.4 <NA>    "Inte…
##  6                     1 Tota…  2018 Seats… <NA>    NA       23.4 <NA>    "Inte…
##  7                     1 Tota…  2019 Seats… <NA>    NA       24.3 <NA>    "Inte…
##  8                     1 Tota…  2020 Seats… <NA>    NA       24.9 Data a… "Inte…
##  9                     1 Tota…  2021 Seats… <NA>    NA       25.6 Data a… "Inte…
## 10                    15 Nort…  2000 Seats… <NA>    NA        5.4 <NA>    "Inte…
## # … with 1,948 more rows, and abbreviated variable names ¹​`Last Election Date`,
## #   ²​`Last Election Date footnote`, ³​Footnotes
colnames(df_UN_WO)
## [1] "Region/Country/Area"         "...2"                       
## [3] "Year"                        "Series"                     
## [5] "Last Election Date"          "Last Election Date footnote"
## [7] "Value"                       "Footnotes"                  
## [9] "Source"
# Rename the needed columns, then spread the series into its own column.
UN_WO_tbl <- df_UN_WO %>%
  select(
    num = `Region/Country/Area`,
    region = `...2`,
    year = Year,
    series = Series,
    LED = `Last Election Date`,
    value = Value
  ) %>%
  pivot_wider(names_from = series, values_from = value)
UN_WO_tbl
## # A tibble: 1,958 × 5
##      num region                         year LED   Seats held by women in nati…¹
##    <dbl> <chr>                         <dbl> <chr>                         <dbl>
##  1     1 Total, all countries or areas  2000 <NA>                           13.3
##  2     1 Total, all countries or areas  2005 <NA>                           15.9
##  3     1 Total, all countries or areas  2010 <NA>                           19  
##  4     1 Total, all countries or areas  2015 <NA>                           22.3
##  5     1 Total, all countries or areas  2017 <NA>                           23.4
##  6     1 Total, all countries or areas  2018 <NA>                           23.4
##  7     1 Total, all countries or areas  2019 <NA>                           24.3
##  8     1 Total, all countries or areas  2020 <NA>                           24.9
##  9     1 Total, all countries or areas  2021 <NA>                           25.6
## 10    15 Northern Africa                2000 <NA>                            5.4
## # … with 1,948 more rows, and abbreviated variable name
## #   ¹​`Seats held by women in national parliament, as of February (%)`
# Replace the five column names by position; only the long series name in
# the fifth column actually changes (to "seats").
UN_WO_tbl <- setNames(UN_WO_tbl, c("num", "region", "year", "LED", "seats"))
UN_WO_tbl
## # A tibble: 1,958 × 5
##      num region                         year LED   seats
##    <dbl> <chr>                         <dbl> <chr> <dbl>
##  1     1 Total, all countries or areas  2000 <NA>   13.3
##  2     1 Total, all countries or areas  2005 <NA>   15.9
##  3     1 Total, all countries or areas  2010 <NA>   19  
##  4     1 Total, all countries or areas  2015 <NA>   22.3
##  5     1 Total, all countries or areas  2017 <NA>   23.4
##  6     1 Total, all countries or areas  2018 <NA>   23.4
##  7     1 Total, all countries or areas  2019 <NA>   24.3
##  8     1 Total, all countries or areas  2020 <NA>   24.9
##  9     1 Total, all countries or areas  2021 <NA>   25.6
## 10    15 Northern Africa                2000 <NA>    5.4
## # … with 1,948 more rows
# Keep only the three columns used in the charts.
UN_WO_tbl_short <- select(UN_WO_tbl, region, year, seats)
UN_WO_tbl_short
## # A tibble: 1,958 × 3
##    region                         year seats
##    <chr>                         <dbl> <dbl>
##  1 Total, all countries or areas  2000  13.3
##  2 Total, all countries or areas  2005  15.9
##  3 Total, all countries or areas  2010  19  
##  4 Total, all countries or areas  2015  22.3
##  5 Total, all countries or areas  2017  23.4
##  6 Total, all countries or areas  2018  23.4
##  7 Total, all countries or areas  2019  24.3
##  8 Total, all countries or areas  2020  24.9
##  9 Total, all countries or areas  2021  25.6
## 10 Northern Africa                2000   5.4
## # … with 1,948 more rows
# Store year as an integer rather than a double.
UN_WO_tbl_short <- UN_WO_tbl_short %>%
  mutate(year = as.integer(year))
UN_WO_tbl_short
## # A tibble: 1,958 × 3
##    region                         year seats
##    <chr>                         <int> <dbl>
##  1 Total, all countries or areas  2000  13.3
##  2 Total, all countries or areas  2005  15.9
##  3 Total, all countries or areas  2010  19  
##  4 Total, all countries or areas  2015  22.3
##  5 Total, all countries or areas  2017  23.4
##  6 Total, all countries or areas  2018  23.4
##  7 Total, all countries or areas  2019  24.3
##  8 Total, all countries or areas  2020  24.9
##  9 Total, all countries or areas  2021  25.6
## 10 Northern Africa                2000   5.4
## # … with 1,948 more rows

6.6.2.1 Visualization

# Distribution of the share of seats for five selected years; year is
# turned into a factor so that each year gets its own colour.
UN_WO_tbl_short %>%
  filter(year %in% c(2000, 2005, 2010, 2015, 2020)) %>%
  mutate(year = factor(year)) %>%
  ggplot(aes(x = seats, color = year)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Count the observations per year in bins of width 5.
UN_WO_tbl_short %>%
  filter(year %in% c(2000, 2005, 2010, 2015, 2020)) %>%
  mutate(year = factor(year)) %>%
  group_by(year) %>%
  count(cut_width(seats, 5))
## # A tibble: 57 × 3
## # Groups:   year [5]
##    year  `cut_width(seats, 5)`     n
##    <fct> <fct>                 <int>
##  1 2000  [-2.5,2.5]               24
##  2 2000  (2.5,7.5]                50
##  3 2000  (7.5,12.5]               65
##  4 2000  (12.5,17.5]              25
##  5 2000  (17.5,22.5]              21
##  6 2000  (22.5,27.5]               9
##  7 2000  (27.5,32.5]               6
##  8 2000  (32.5,37.5]               5
##  9 2000  (42.5,47.5]               1
## 10 2005  [-2.5,2.5]               17
## # … with 47 more rows
UN_WO_tbl_short
## # A tibble: 1,958 × 3
##    region                         year seats
##    <chr>                         <int> <dbl>
##  1 Total, all countries or areas  2000  13.3
##  2 Total, all countries or areas  2005  15.9
##  3 Total, all countries or areas  2010  19  
##  4 Total, all countries or areas  2015  22.3
##  5 Total, all countries or areas  2017  23.4
##  6 Total, all countries or areas  2018  23.4
##  7 Total, all countries or areas  2019  24.3
##  8 Total, all countries or areas  2020  24.9
##  9 Total, all countries or areas  2021  25.6
## 10 Northern Africa                2000   5.4
## # … with 1,948 more rows
wb_regions
## # A tibble: 45 × 2
##    region                                        iso3c
##    <chr>                                         <chr>
##  1 Caribbean small states                        CSS  
##  2 Central Europe and the Baltics                CEB  
##  3 Early-demographic dividend                    EAR  
##  4 East Asia & Pacific                           EAS  
##  5 East Asia & Pacific (excluding high income)   EAP  
##  6 East Asia & Pacific (IDA & IBRD)              TEA  
##  7 Euro area                                     EMU  
##  8 Europe & Central Asia                         ECS  
##  9 Europe & Central Asia (excluding high income) ECA  
## 10 Europe & Central Asia (IDA & IBRD)            TEC  
## # … with 35 more rows
# Compare the world total with the African regions.
UN_WO_tbl_short %>%
  filter(
    region %in% c(
      "Total, all countries or areas", "Northern Africa",
      "Sub-Saharan Africa", "Eastern Africa", "Middle Africa",
      "Southern Africa", "Western Africa"
    )
  ) %>%
  ggplot(aes(x = year, y = seats, color = region)) +
  geom_line() +
  labs(title = "Seats of Women in Parliament")

Good Luck!

If you need help, please drop me a line. I can set up a Zoom Office Hour.

HS